# -*- coding:UTF-8 -*-
"""
@Project:   DataCrawler
@FileName:  second_floor.py
@Create:    2023/5/3 18:00  
@Author:    Jia  
@Descr:     ->爬取天眼查第二层数据

-------------------------------------------------------
数据结构：
redis 集合              redis 哈希
-------------------------------------------------------
is_crawled  已爬        first_floor_data  company:url
no_crawled  未爬
new_urls    新URL
fail_urls   失败URL
first_floor_url  爬取第一层数据的URL
--------------------------------------------------------
"""
from Common.url_manager import UrlManager
from Common.downloader import HtmlDownloader
from Common.html_parser import HtmlParser
from Common.output import Output
import traceback
from Config.log import Logs
from database import redis_db
import asyncio
import time
# NOTE: `global url` / `global count` at module scope are no-ops — the
# `global` statement only has meaning inside a function body and does not
# create the names.  Initialize the shared state explicitly instead, so
# code that reads these globals (e.g. a failure handler) never hits a
# NameError before the first assignment.
url = None   # most recently dequeued URL (shared with SecondFloor.crawling)
count = 0    # number of URLs processed so far

logger = Logs().debug_logger()        # project debug logger, shared module-wide
rds = redis_db.redis_conn2(logger)    # redis connection shared by all components


class SecondFloor:
    """Crawl the second-layer (detail-page) data from Tianyancha.

    URLs are pulled one at a time from the redis ``second_no_crawled``
    set, downloaded asynchronously and handed to the parser.  URLs that
    fail with an unexpected error are recorded in the ``second_fail_url``
    set so they can be retried later.
    """

    def __init__(self):
        # Wall-clock reference used to report the total runtime at the end.
        self.start_time = time.perf_counter()
        self.urls = UrlManager(rds)               # redis-backed URL queue
        self.downloader = HtmlDownloader(logger)  # async HTML downloader
        self.parser = HtmlParser(rds)             # page parser / extractor
        self.export = Output(rds)                 # result exporter

    async def crawling(self, max_count=1000):
        """Read first-layer URLs and crawl their second-layer data.

        :param max_count: stop after this many URLs have been processed;
                          generalizes the previously hard-coded 1000.

        Fixes over the previous revision:
        * ``traceback.print_exc()`` returns ``None`` (it prints to
          stderr), so the failure log always said ``crawling fail:None``;
          use :func:`traceback.format_exc` to capture the traceback text.
        * The old ``create_task``/``gather`` section awaited each task as
          it was created, so it ran strictly sequentially while silently
          consuming a *second* URL per loop iteration that ``count`` never
          tracked — each iteration now processes exactly one URL.
        * ``url`` could be unbound in the ``except Exception`` handler if
          fetching the URL itself raised; it is initialized up front and
          guarded before being added to the failure set.
        * Module-level ``global`` state replaced with plain locals —
          nothing else reads it.
        """
        url = None   # last URL taken from the queue, kept for failure bookkeeping
        count = 1

        # Keep going while the "not yet crawled" set is non-empty.
        while self.urls.second_has_crawled():
            try:
                # Take one URL from the pending set.
                url = await self.urls.second_get_url()

                # Download the page with the async downloader.
                html_content = await self.downloader.async_download(url)

                # Extract the second-layer data from the page.
                await self.parser.parser_second_floor(html_content)

                # Honour the processing cap.
                if count >= max_count:
                    break

            except KeyboardInterrupt:
                logger.debug('键盘手动中断')
                break
            except (KeyError, AttributeError):
                # Expected element missing on the page — skip this URL.
                logger.debug('没有找到元素')
                count += 1
                continue
            except Exception:
                # format_exc() (not print_exc()) so the traceback lands in the log.
                logger.info(f'crawling fail:{traceback.format_exc()}')
                if url is not None:
                    rds.sadd('second_fail_url', url)
                count += 1
            else:
                logger.info(f'第{count}个爬取成功')
                count += 1

        # Report total runtime.
        end_time = time.perf_counter()
        use_time = end_time - self.start_time
        logger.info(f'本次运行共用时{round(use_time, 2)}秒')


if __name__ == '__main__':
    # Entry point: build the crawler and drive its coroutine to completion.
    crawler = SecondFloor()
    asyncio.run(crawler.crawling())


